# Core numerics / data handling
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Plotting
import matplotlib.pyplot as plt
# NOTE(review): '%matplotlib inline' is an IPython/Jupyter magic and is a
# syntax error in a plain .py script, so it is commented out here.
# %matplotlib inline
import seaborn as sns

# scikit-learn: preprocessing, model selection, models, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics
# Load the vehicle silhouette data set and take a first look.
# NOTE: in a notebook these bare expressions were auto-echoed; in a plain
# script they are no-ops, so wrap them in print().
cData = pd.read_csv("Vehicle.csv")
print(cData.shape)
print(cData.head())
print(cData.describe().transpose())
print(cData.dtypes)
print(cData.isnull().values.any())
print(cData.isnull().sum())
# Replace missing values: blank cells are read in as ' ', so turn them into
# NaN, coerce each feature column to numeric, and impute with the median.
cData = cData.replace(' ', np.nan)
# BUG FIX(review): the original iterated columns[:17], but there are 18
# feature columns (the 0:18 slice used for x later) — impute every column
# except the trailing 'class' target.
for col in cData.columns[:-1]:
    # Columns that contained ' ' were parsed as object dtype; coerce to
    # numeric so .median() works.
    cData[col] = pd.to_numeric(cData[col], errors='coerce')
    cData[col] = cData[col].fillna(cData[col].median())
print(cData.isnull().values.any())
# Since the variable 'class' is of category type:
# Treat the target as a categorical variable and inspect its distribution.
cData['class'] = cData['class'].astype('category')
cData.info()  # info() prints directly
print(cData['class'].value_counts())
sns.countplot(cData['class'])
# Histograms of every numeric feature.
cData.hist(figsize=(18, 18))
# Outliers: visualise with a boxplot, then clamp any value outside the
# 1.5*IQR whiskers back to the column median.
g = cData.boxplot(figsize=(15, 15))
g.set_xticklabels(plt.xticks()[1], rotation=60)
for col_name in cData.columns[:-1]:
    q1 = cData[col_name].quantile(0.25)
    q3 = cData[col_name].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    # Replace out-of-whisker values with the column median (computed on the
    # column as it stands, i.e. before this replacement).
    cData.loc[(cData[col_name] < low) | (cData[col_name] > high), col_name] = cData[col_name].median()
# Re-plot to confirm the outliers were clamped.
g = cData.boxplot(figsize=(15, 15))
g.set_xticklabels(plt.xticks()[1], rotation=60)
# Encode the categorical target as integers so models/correlation can use it.
labelencoder_X = LabelEncoder()
cData['class'] = labelencoder_X.fit_transform(cData['class'])
print(cData.corr())
def plot_corr(df, size=14, title='Correlation'):
    """Plot an annotated correlation heatmap of df's numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise correlations are plotted.
    size : int
        Figure width/height in inches.
    title : str
        Plot title.
    """
    sns.set(font_scale=1.15)
    fig, ax = plt.subplots(figsize=(size, size))
    g = sns.heatmap(df.corr(), vmin=0.5, annot=True, linewidths=0.01,
                    center=1, linecolor="white", cbar=False, square=True)
    # Work around the matplotlib 3.1.1 regression that clipped the top and
    # bottom heatmap rows.
    bottom, top = g.get_ylim()
    g.set_ylim(bottom + 0.5, top - 0.5)
    g.set_xticklabels(plt.xticks()[1], rotation=60)
    plt.title(title, fontsize=12)
    ax.tick_params(labelsize=14)

plot_corr(cData)
# High positive correlation exists between scatter_ratio,
# pr.axis_rectangularity, scaled_variance1, distance_circularity,
# scaled_variance and radius_ratio.
# High negative correlation exists with elongatedness.
# The following features clearly have very low correlation compared to the
# others: pr.axis_aspect_ratio, max.length_aspect_ratio,
# scaled_radius_of_gyration.1, skewness_about, skewness_about.1,
# skewness_about.2, hollows_ratio.
# Pairwise scatter plots of all features, coloured by vehicle class.
sns.pairplot(cData, hue='class')
# The pairplot clearly tells us the same: high positive correlation exists
# between scatter_ratio, pr.axis_rectangularity, scaled_variance1,
# distance_circularity, scaled_variance and radius_ratio.
# High negative correlation exists with elongatedness.
# The following features clearly have very low correlation compared to the
# others: pr.axis_aspect_ratio, max.length_aspect_ratio,
# scaled_radius_of_gyration.1, skewness_about, skewness_about.1,
# skewness_about.2, hollows_ratio.
# Separate the 18 feature columns (x) from the encoded target (y).
x = cData.iloc[:, 0:18]
y = cData.iloc[:, 18]
# Split x and y into training and test sets in a 70:30 ratio.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

# --- Logistic regression on the raw features ---
LogisticModel = LogisticRegression()
LogisticModel.fit(x_train, y_train)
LogisticModelPrediction = LogisticModel.predict(x_test)
model_score_Logistic = LogisticModel.score(x_test, y_test)
print(model_score_Logistic)
print('Accuracy on Training data: ', LogisticModel.score(x_train, y_train))
print('Accuracy on Testing data: ', LogisticModel.score(x_test, y_test))
print('Recall value: ', metrics.recall_score(y_test, LogisticModelPrediction, average='macro'))
print('Precision value: ', metrics.precision_score(y_test, LogisticModelPrediction, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, LogisticModelPrediction))
print("classification Report:\n", metrics.classification_report(y_test, LogisticModelPrediction))
# Collect each model's accuracy for the final comparison table.
# FIX: pass the index as a list, not a set — sets are unordered.
resultsDf1 = pd.DataFrame({'Model': ['Logistic'], 'Accuracy': model_score_Logistic}, index=['1'])
resultsDf1 = resultsDf1[['Model', 'Accuracy']]
print(resultsDf1)
# --- Gaussian Naive Bayes on the raw features ---
GaussianNBModel = GaussianNB()
GaussianNBModel.fit(x_train, y_train)
GaussianNBModelPrediction = GaussianNBModel.predict(x_test)
model_score_GaussianNB = GaussianNBModel.score(x_test, y_test)
print(model_score_GaussianNB)
print('Accuracy on Training data: ', GaussianNBModel.score(x_train, y_train))
print('Accuracy on Testing data: ', GaussianNBModel.score(x_test, y_test))
print('Recall value: ', metrics.recall_score(y_test, GaussianNBModelPrediction, average='macro'))
print('Precision value: ', metrics.precision_score(y_test, GaussianNBModelPrediction, average='macro'))
# BUG FIX: sklearn expects (y_true, y_pred); the original swapped the
# arguments, transposing the confusion matrix and mislabelling the report.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, GaussianNBModelPrediction))
print("Classification Report:\n", metrics.classification_report(y_test, GaussianNBModelPrediction))
# Store the accuracy result for the final comparison (index as a list).
tempResultsDf = pd.DataFrame({'Model': ['Naive Bayes'], 'Accuracy': model_score_GaussianNB}, index=['2'])
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model', 'Accuracy']]
print(resultsDf1)
# --- Support Vector Machine (default RBF kernel) on the raw features ---
svmClassifier = SVC()
svmClassifier = svmClassifier.fit(x_train, y_train)
svmClassifierPrediction = svmClassifier.predict(x_test)
print('Accuracy on Training data: ', svmClassifier.score(x_train, y_train))
print('Accuracy on Testing data: ', svmClassifier.score(x_test, y_test))
print('Recall value: ', metrics.recall_score(y_test, svmClassifierPrediction, average='macro'))
print('Precision value: ', metrics.precision_score(y_test, svmClassifierPrediction, average='macro'))
# BUG FIX: (y_true, y_pred) argument order for the matrix and report.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, svmClassifierPrediction))
print("Classification Report:\n", metrics.classification_report(y_test, svmClassifierPrediction))
tempResultsDf = pd.DataFrame({'Model': ['SVM'], 'Accuracy': svmClassifier.score(x_test, y_test)}, index=['3'])
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model', 'Accuracy']]
print(resultsDf1)
# SVM with k-fold cross validation (the original comment wrongly said
# "Naive Bayes" — svmClassifier is what is scored here).
# BUG FIX: the original passed the whole dataframe cData — which still
# contains the target column 'class' — as the feature matrix, i.e. target
# leakage. Use the feature matrix x instead.
scores = cross_val_score(svmClassifier, x, y, cv=18)
print(scores)
print('Average score: ', np.mean(scores))
# Store the accuracy result for the final comparison.
# FIX: use a unique index label '4' — the original reused '3'.
tempResultsDf = pd.DataFrame({'Model': ['SVM k fold'], 'Accuracy': np.mean(scores)}, index=['4'])
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model', 'Accuracy']]
print(resultsDf1)
# Standardise every feature to zero mean / unit variance (z-score) ahead of
# PCA, which is scale-sensitive.
XScaled = x.apply(zscore)
print(XScaled.head())
# Plot the raw data and the scaled data side by side for comparison.
plt.rcParams['figure.figsize'] = (10, 6)
plt.plot(cData)
plt.show()
plt.plot(XScaled)
plt.show()
# Covariance between attributes after scaling (for z-scored data this is
# effectively the correlation matrix). Transpose so rows are variables.
covMatrix = np.cov(XScaled.T, rowvar=True)
print('Covariance Matrix \n', covMatrix)
# Eigen decomposition: eigenvectors are the COLUMNS of `eigenvectors`.
eigenvalues, eigenvectors = np.linalg.eig(covMatrix)
# BUG FIX: the original used print('... \n%s', v), which prints the literal
# '%s' followed by the array; use %-formatting.
print('Eigen Vectors \n%s' % eigenvectors)
print('\n Eigen Values \n%s' % eigenvalues)
# Step 3 (continued): sort eigenvalues in descending order.
# Make a list of (eigenvalue, eigenvector) pairs; each eigenvector is a
# column of `eigenvectors`.
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]
# BUG FIX: sort by the eigenvalue only. A plain tuple sort() falls back to
# comparing the numpy-array second elements when eigenvalues tie, which
# raises "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Extract the descending-ordered eigenvalues and eigenvectors.
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Confirm the sorting worked.
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)
# Explained-variance ratio of each principal component and its cumulative sum.
tot = sum(eigenvalues)
var_explained = [(ev / tot) * 100 for ev in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_explained)
print("Cumulative Variance Explained", cum_var_exp)
# Scree plot: bars = individual, step = cumulative explained variance.
# FIX: removed the stray plt.plot(var_explained) that ran BEFORE the figure
# below was created — it drew onto whatever figure happened to be current.
plt.figure(figsize=(8, 7))
plt.bar(range(1, eigenvalues.size + 1), var_explained, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, eigenvalues.size + 1), cum_var_exp, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Reducing from the 18-dimensional feature space to 10 dimensions
# (the original comment said 17, but x has 18 feature columns).
pca = PCA(n_components=10)
data_reduced = pca.fit_transform(XScaled)
data_reduced.transpose()
print(pca.components_)
X_comp = pd.DataFrame(pca.components_, columns=list(XScaled))
print(X_comp.head())
# BUG FIX: the original used eigenvectors[0:10], which slices the first 10
# ROWS of the raw (unsorted) matrix — but the eigenvectors are its columns
# and must be ranked by eigenvalue. Use the sorted list built earlier.
P_reduce = np.array(eigvectors_sorted[0:10])
# Project the scaled data onto the top-10 principal components:
# (n, 18) . (18, 10) -> (n, 10).
X_std_10D = np.dot(XScaled, P_reduce.T)
# Convert the array to a dataframe for the pairplot.
Proj_data_df = pd.DataFrame(X_std_10D)
sns.pairplot(Proj_data_df, diag_kind='kde')
# Split the PCA-projected data into training and test sets (70:30).
x_train1, x_test1, y_train1, y_test1 = train_test_split(Proj_data_df, y, test_size=0.3, random_state=10)

# --- Logistic regression on the PCA-projected features ---
LogisticModel1 = LogisticRegression()
LogisticModel1.fit(x_train1, y_train1)
LogisticModelPrediction1 = LogisticModel1.predict(x_test1)
model_score_Logistic1 = LogisticModel1.score(x_test1, y_test1)
print(model_score_Logistic1)
print('Accuracy on Training data: ', LogisticModel1.score(x_train1, y_train1))
print('Accuracy on Testing data: ', LogisticModel1.score(x_test1, y_test1))
print('Recall value: ', metrics.recall_score(y_test1, LogisticModelPrediction1, average='macro'))
print('Precision value: ', metrics.precision_score(y_test1, LogisticModelPrediction1, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test1, LogisticModelPrediction1))
print("classification Report:\n", metrics.classification_report(y_test1, LogisticModelPrediction1))
# FIX: pass the index as a list, not a set — sets are unordered.
resultsDf2 = pd.DataFrame({'Model': ['Logistic'], 'Accuracy': model_score_Logistic1}, index=['1'])
resultsDf2 = resultsDf2[['Model', 'Accuracy']]
print(resultsDf2)
# --- Gaussian Naive Bayes on the PCA-projected features ---
GaussianNBModel1 = GaussianNB()
GaussianNBModel1.fit(x_train1, y_train1)
GaussianNBModelPrediction1 = GaussianNBModel1.predict(x_test1)
model_score_GaussianNB1 = GaussianNBModel1.score(x_test1, y_test1)
print(model_score_GaussianNB1)
print('Accuracy on Training data: ', GaussianNBModel1.score(x_train1, y_train1))
print('Accuracy on Testing data: ', GaussianNBModel1.score(x_test1, y_test1))
print('Recall value: ', metrics.recall_score(y_test1, GaussianNBModelPrediction1, average='macro'))
print('Precision value: ', metrics.precision_score(y_test1, GaussianNBModelPrediction1, average='macro'))
# BUG FIX: (y_true, y_pred) argument order — the original swapped them.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test1, GaussianNBModelPrediction1))
print("Classification Report:\n", metrics.classification_report(y_test1, GaussianNBModelPrediction1))
# Store the accuracy result for the final comparison (index as a list).
tempResultsDf = pd.DataFrame({'Model': ['Naive Bayes'], 'Accuracy': model_score_GaussianNB1}, index=['2'])
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model', 'Accuracy']]
print(resultsDf2)
# --- Support Vector Machine on the PCA-projected features ---
svmClassifier1 = SVC()
svmClassifier1 = svmClassifier1.fit(x_train1, y_train1)
svmClassifierPrediction1 = svmClassifier1.predict(x_test1)
print('Accuracy on Training data: ', svmClassifier1.score(x_train1, y_train1))
print('Accuracy on Testing data: ', svmClassifier1.score(x_test1, y_test1))
print('Recall value: ', metrics.recall_score(y_test1, svmClassifierPrediction1, average='macro'))
print('Precision value: ', metrics.precision_score(y_test1, svmClassifierPrediction1, average='macro'))
# BUG FIX: (y_true, y_pred) argument order for the matrix and report.
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test1, svmClassifierPrediction1))
print("Classification Report:\n", metrics.classification_report(y_test1, svmClassifierPrediction1))
tempResultsDf = pd.DataFrame({'Model': ['SVM'], 'Accuracy': svmClassifier1.score(x_test1, y_test1)}, index=['3'])
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model', 'Accuracy']]
print(resultsDf2)
# SVM with k-fold cross validation on the PCA-projected data (the original
# comment wrongly said "Naive Bayes").
scores = cross_val_score(svmClassifier1, Proj_data_df, y, cv=18)
print(scores)
print('Average score: ', np.mean(scores))
# Store the accuracy result for the final comparison.
# FIX: use a unique index label '4' — the original reused '3'.
tempResultsDf = pd.DataFrame({'Model': ['SVM k fold'], 'Accuracy': np.mean(scores)}, index=['4'])
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model', 'Accuracy']]
# Final side-by-side comparison; bare dataframe expressions display nothing
# in a plain script, so print them explicitly.
print("Before PCA:")
print(resultsDf1)
print("After PCA:")
print(resultsDf2)